data_orig <- fread(here::here("data", "candy-data_edit_2021.csv"), stringsAsFactors = FALSE)
data <- data_orig
characteristica <- "chocolate+fruity+caramel+peanutyalmondy+nougat+crispedricewafer+hard+bar+pluribus"
Note that we added producer manually.
Remove one dime and one quarter as they do not belong to the original distribution and might change regression results.
data <- data[data$competitorname %ni% c("One dime", "One quarter")]
Because we deleted data, we have to readjust percentile rangs of pricepercent and sugarpercent.
# define function
percentile_ranked <- function(a_vector, value) {
length(sort(a_vector)[a_vector < value]) / length(a_vector)
}
b <- data$sugarpercent
data$sugarpercent <- sapply(data$sugarpercent, percentile_ranked, a_vector = b)
b <- data$pricepercent
data$pricepercent <- sapply(data$pricepercent, percentile_ranked, a_vector = b)
We checked the sugar content manually and is not plausible (Reese Miniatures, Nestle Crunch, Skittles Wild Berry, Milky Way Simply Caramel). Hence, we build a robust data set without the variable sugarpercent.
robust <- data
robust$sugarpercent <- NULL
Next, we set a numerical subset for further use.
data_num <- subset(data, select = -c(competitorname, producer))
robust_num <- subset(robust, select = -c(competitorname, producer))
data %>% missing_plot()
No missing values.
kable(describe(data, omit = TRUE), caption = "Summary statistics", digits = 3)
| vars | n | mean | sd | median | trimmed | mad | min | max | range | skew | kurtosis | se | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| chocolate | 3 | 83 | 0.446 | 0.500 | 0.000 | 0.433 | 0.000 | 0.000 | 1.000 | 1.000 | 0.214 | -1.978 | 0.055 |
| fruity | 4 | 83 | 0.458 | 0.501 | 0.000 | 0.448 | 0.000 | 0.000 | 1.000 | 1.000 | 0.166 | -1.996 | 0.055 |
| caramel | 5 | 83 | 0.169 | 0.377 | 0.000 | 0.090 | 0.000 | 0.000 | 1.000 | 1.000 | 1.738 | 1.033 | 0.041 |
| peanutyalmondy | 6 | 83 | 0.169 | 0.377 | 0.000 | 0.090 | 0.000 | 0.000 | 1.000 | 1.000 | 1.738 | 1.033 | 0.041 |
| nougat | 7 | 83 | 0.084 | 0.280 | 0.000 | 0.000 | 0.000 | 0.000 | 1.000 | 1.000 | 2.938 | 6.711 | 0.031 |
| crispedricewafer | 8 | 83 | 0.084 | 0.280 | 0.000 | 0.000 | 0.000 | 0.000 | 1.000 | 1.000 | 2.938 | 6.711 | 0.031 |
| hard | 9 | 83 | 0.181 | 0.387 | 0.000 | 0.104 | 0.000 | 0.000 | 1.000 | 1.000 | 1.630 | 0.664 | 0.042 |
| bar | 10 | 83 | 0.253 | 0.437 | 0.000 | 0.194 | 0.000 | 0.000 | 1.000 | 1.000 | 1.116 | -0.764 | 0.048 |
| pluribus | 11 | 83 | 0.530 | 0.502 | 1.000 | 0.537 | 0.000 | 0.000 | 1.000 | 1.000 | -0.119 | -2.010 | 0.055 |
| sugarpercent | 12 | 83 | 0.472 | 0.286 | 0.446 | 0.469 | 0.375 | 0.000 | 0.988 | 0.988 | 0.097 | -1.183 | 0.031 |
| pricepercent | 13 | 83 | 0.464 | 0.289 | 0.458 | 0.459 | 0.322 | 0.000 | 0.976 | 0.976 | 0.127 | -1.201 | 0.032 |
| winpercent | 14 | 83 | 50.585 | 14.749 | 48.983 | 50.102 | 15.202 | 22.445 | 84.180 | 61.735 | 0.295 | -0.689 | 1.619 |
corrplot(as.matrix(cor(na.omit(data_num))))
# corrplot for presentation
data_num_deutsch <- subset(data_num, select = -c(sugarpercent, pricepercent))
colnames(data_num_deutsch) <- c(
"Schokolade",
"Frucht",
"Karamell",
"Nuss",
"Nougat",
"Keksartig",
"Hart",
"Riegel",
"Mehrteilig",
"Beliebtheit"
)
corrplot(as.matrix(cor(na.omit(data_num_deutsch))), tl.col = "black", tl.cex = 1.2, tl.srt = 50)
High negative correlation between chocolate and fruity. High positive correlation between chocolate and winpercent. Negative correlation between pluribus and bar and bar and fruity. Also positive correlation of pricepercent and winpercent
plot(data$winpercent, data$sugarpercent)
plot(data$winpercent, data$pricepercent)
plot(data$winpercent, data$fruity)
plot(data$winpercent, data$crispedricewafer)
plot(data$winpercent, data$chocolate)
plot(data$winpercent, data$nougat)
plot(data$winpercent, data$caramel)
plot(data$winpercent, data$pluribus)
plot(data$winpercent, data$bar)
plot(data$winpercent, data$hard)
Producer frequencies
producer_freq <- table(data$producer)
producer_freq
##
## American Licorice Company Ferrara Haribo
## 1 20 4
## Hershey Impact Just Born
## 17 1 1
## Mars Mondelez Perfetti
## 14 4 1
## Several Spangler Storck
## 2 1 1
## SweetWorks Tootsie Topps
## 1 11 1
## Washburn Welchs Zeta
## 1 1 1
# save for use in presentation
write.xlsx(producer_freq, here::here("output", "2021", "producer_freq.xlsx"))
We start with an easy linear model with only characteristica.
mod_lm <- lm(winpercent ~ ., data_num)
summary(mod_lm)
##
## Call:
## lm(formula = winpercent ~ ., data = data_num)
##
## Residuals:
## Min 1Q Median 3Q Max
## -20.4077 -6.0740 0.8284 6.7131 23.8284
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 32.3272 5.0052 6.459 1.14e-08 ***
## chocolate 21.2291 4.1161 5.158 2.17e-06 ***
## fruity 11.0000 4.0658 2.705 0.00853 **
## caramel 2.7913 3.6848 0.758 0.45125
## peanutyalmondy 10.6861 3.6418 2.934 0.00450 **
## nougat 0.3781 5.7094 0.066 0.94739
## crispedricewafer 8.8919 5.2566 1.692 0.09511 .
## hard -5.8652 3.4633 -1.694 0.09474 .
## bar 1.7567 5.1490 0.341 0.73398
## pluribus 0.2133 3.1882 0.067 0.94685
## sugarpercent 9.8302 4.5527 2.159 0.03421 *
## pricepercent -7.3997 5.5215 -1.340 0.18446
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 10.68 on 71 degrees of freedom
## Multiple R-squared: 0.5461, Adjusted R-squared: 0.4758
## F-statistic: 7.766 on 11 and 71 DF, p-value: 1.222e-08
vif(mod_lm)
## chocolate fruity caramel peanutyalmondy
## 3.046647 2.986683 1.385823 1.353677
## nougat crispedricewafer hard bar
## 1.832239 1.553142 1.292665 3.647083
## pluribus sugarpercent pricepercent
## 1.842924 1.223164 1.825270
resettest(mod_lm)
##
## RESET test
##
## data: mod_lm
## RESET = 3.1783, df1 = 2, df2 = 69, p-value = 0.04782
raintest(mod_lm)
##
## Rainbow test
##
## data: mod_lm
## Rain = 2.0262, df1 = 42, df2 = 29, p-value = 0.02437
bptest(mod_lm)
##
## studentized Breusch-Pagan test
##
## data: mod_lm
## BP = 15.488, df = 11, p-value = 0.1612
durbinWatsonTest(mod_lm)
## lag Autocorrelation D-W Statistic p-value
## 1 0.1206196 1.729298 0.124
## Alternative hypothesis: rho != 0
plot(mod_lm)
Adjusted R-squared of 0.4757844 is reasonable. Additionally, significant values with high effect sizes for chocolate (+), fruity (+), peanutyalmondy (+), crispedricewafer (+), hard (-) and sugarpercent (+). Negative coefficient for pricepercent seems plausible. Linearity assumptions more or less given. VIF okay. No heteroscedasticity. But raintest for linearity fails. Some variables with high leverage (Hershey Whoppers and Snickers Crisp), but could just be due to low sample size. Resettest gives a hint that model might be wrongly specified, hence add more controls.
mod_lm_sq <- lm(winpercent ~ . + I(pricepercent^2) + I(sugarpercent^2), data_num)
summary(mod_lm_sq)
##
## Call:
## lm(formula = winpercent ~ . + I(pricepercent^2) + I(sugarpercent^2),
## data = data_num)
##
## Residuals:
## Min 1Q Median 3Q Max
## -22.3474 -5.4761 0.0792 7.6808 22.4534
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 32.8062 5.6632 5.793 1.88e-07 ***
## chocolate 21.1224 4.0689 5.191 2.01e-06 ***
## fruity 10.8851 4.0321 2.700 0.00872 **
## caramel 2.3182 3.6504 0.635 0.52750
## peanutyalmondy 9.9475 3.6652 2.714 0.00839 **
## nougat -0.2775 5.7985 -0.048 0.96197
## crispedricewafer 8.7767 5.1994 1.688 0.09592 .
## hard -4.8142 3.4880 -1.380 0.17198
## bar 4.1663 5.2421 0.795 0.42947
## pluribus 0.3409 3.1521 0.108 0.91418
## sugarpercent -19.7129 18.4106 -1.071 0.28802
## pricepercent 17.0319 18.3694 0.927 0.35706
## I(pricepercent^2) -23.0490 17.5241 -1.315 0.19277
## I(sugarpercent^2) 27.9375 17.6441 1.583 0.11791
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 10.55 on 69 degrees of freedom
## Multiple R-squared: 0.5691, Adjusted R-squared: 0.4879
## F-statistic: 7.01 on 13 and 69 DF, p-value: 2.016e-08
vif(mod_lm_sq)
## chocolate fruity caramel peanutyalmondy
## 3.047612 3.006811 1.392179 1.403506
## nougat crispedricewafer hard bar
## 1.934621 1.555489 1.342147 3.869638
## pluribus sugarpercent pricepercent I(pricepercent^2)
## 1.843996 20.475311 20.680398 18.820574
## I(sugarpercent^2)
## 19.071573
resettest(mod_lm_sq)
##
## RESET test
##
## data: mod_lm_sq
## RESET = 2.9699, df1 = 2, df2 = 67, p-value = 0.0581
raintest(mod_lm_sq)
##
## Rainbow test
##
## data: mod_lm_sq
## Rain = 2.3585, df1 = 42, df2 = 27, p-value = 0.01038
bptest(mod_lm_sq)
##
## studentized Breusch-Pagan test
##
## data: mod_lm_sq
## BP = 15.73, df = 13, p-value = 0.264
durbinWatsonTest(mod_lm_sq)
## lag Autocorrelation D-W Statistic p-value
## 1 0.1627482 1.64358 0.05
## Alternative hypothesis: rho != 0
plot(mod_lm_sq)
The signs of the quadratic terms might give a hint that the marginal effects are decreasing. But as their effect sizes are really small, not significant and the plots against the dependent variable (see response functions) do not give any hints, we should not overinterpret the results. Model specification test did not get better.
Do normal lm with interaction. Note that we do no interaction with sugarpercent and pricepercent as this would be too hard to interpret and blow the model up. Also note that we do only consider interaction of order 2 as higher orders are not available in the data set.
mod_lm_ia <- lm(paste0("winpercent~(", characteristica, ")^2+."), data_num)
summary(mod_lm_ia)
##
## Call:
## lm(formula = paste0("winpercent~(", characteristica, ")^2+."),
## data = data_num)
##
## Residuals:
## Min 1Q Median 3Q Max
## -18.323 -4.555 0.000 5.123 22.184
##
## Coefficients: (18 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 34.8396 14.9411 2.332 0.0235 *
## chocolate 9.5774 18.1869 0.527 0.6007
## fruity 7.5397 15.7288 0.479 0.6337
## caramel -3.9759 11.6501 -0.341 0.7342
## peanutyalmondy 3.6618 17.3984 0.210 0.8341
## nougat 8.4206 28.4788 0.296 0.7686
## crispedricewafer -2.1027 10.9835 -0.191 0.8489
## hard -11.0643 13.6241 -0.812 0.4204
## bar 11.2866 10.9860 1.027 0.3089
## pluribus -0.7438 14.1169 -0.053 0.9582
## sugarpercent 10.0728 4.5903 2.194 0.0326 *
## pricepercent -8.4200 5.4547 -1.544 0.1286
## chocolate:fruity -0.3433 21.0781 -0.016 0.9871
## chocolate:caramel 14.4037 13.8548 1.040 0.3032
## chocolate:peanutyalmondy 30.1905 13.4085 2.252 0.0285 *
## chocolate:nougat -12.5211 29.6417 -0.422 0.6744
## chocolate:crispedricewafer NA NA NA NA
## chocolate:hard NA NA NA NA
## chocolate:bar NA NA NA NA
## chocolate:pluribus 6.3170 17.7014 0.357 0.7226
## fruity:caramel -7.1948 15.6814 -0.459 0.6482
## fruity:peanutyalmondy NA NA NA NA
## fruity:nougat NA NA NA NA
## fruity:crispedricewafer NA NA NA NA
## fruity:hard 5.1245 12.1454 0.422 0.6748
## fruity:bar NA NA NA NA
## fruity:pluribus 3.7290 14.8375 0.251 0.8025
## caramel:peanutyalmondy -21.6567 14.1757 -1.528 0.1325
## caramel:nougat 7.1037 14.8120 0.480 0.6335
## caramel:crispedricewafer -2.0528 14.1557 -0.145 0.8853
## caramel:hard 22.6575 19.4727 1.164 0.2498
## caramel:bar -4.2842 13.6508 -0.314 0.7549
## caramel:pluribus NA NA NA NA
## peanutyalmondy:nougat 18.6273 15.6040 1.194 0.2379
## peanutyalmondy:crispedricewafer NA NA NA NA
## peanutyalmondy:hard NA NA NA NA
## peanutyalmondy:bar -28.6385 13.5621 -2.112 0.0394 *
## peanutyalmondy:pluribus -12.9916 13.2823 -0.978 0.3325
## nougat:crispedricewafer NA NA NA NA
## nougat:hard NA NA NA NA
## nougat:bar NA NA NA NA
## nougat:pluribus NA NA NA NA
## crispedricewafer:hard NA NA NA NA
## crispedricewafer:bar 17.7104 13.3324 1.328 0.1897
## crispedricewafer:pluribus NA NA NA NA
## hard:bar NA NA NA NA
## hard:pluribus -0.1021 7.5475 -0.014 0.9893
## bar:pluribus NA NA NA NA
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 9.761 on 53 degrees of freedom
## Multiple R-squared: 0.7169, Adjusted R-squared: 0.562
## F-statistic: 4.628 on 29 and 53 DF, p-value: 6.99e-07
# note: cannot calculate vif as "there are aliased coefficients in the model", hence remove linear dependent
ld_vars <- attributes(alias(mod_lm_ia)$Complete)$dimnames[[1]]
mod_lm_ia2 <- lm(paste(paste0("winpercent~(", characteristica, ")^2+."), paste(ld_vars, collapse = "-"), sep = "-"), data_num)
summary(mod_lm_ia2)
##
## Call:
## lm(formula = paste(paste0("winpercent~(", characteristica, ")^2+."),
## paste(ld_vars, collapse = "-"), sep = "-"), data = data_num)
##
## Residuals:
## Min 1Q Median 3Q Max
## -18.323 -4.555 0.000 5.123 22.184
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 34.8396 14.9411 2.332 0.0235 *
## chocolate 9.5774 18.1869 0.527 0.6007
## fruity 7.5397 15.7288 0.479 0.6337
## caramel -3.9759 11.6501 -0.341 0.7342
## peanutyalmondy 3.6618 17.3984 0.210 0.8341
## nougat 8.4206 28.4788 0.296 0.7686
## crispedricewafer -2.1027 10.9835 -0.191 0.8489
## hard -11.0643 13.6241 -0.812 0.4204
## bar 11.2866 10.9860 1.027 0.3089
## pluribus -0.7438 14.1169 -0.053 0.9582
## sugarpercent 10.0728 4.5903 2.194 0.0326 *
## pricepercent -8.4200 5.4547 -1.544 0.1286
## chocolate:fruity -0.3433 21.0781 -0.016 0.9871
## chocolate:caramel 14.4037 13.8548 1.040 0.3032
## chocolate:peanutyalmondy 30.1905 13.4085 2.252 0.0285 *
## chocolate:nougat -12.5211 29.6417 -0.422 0.6744
## chocolate:pluribus 6.3170 17.7014 0.357 0.7226
## fruity:caramel -7.1948 15.6814 -0.459 0.6482
## fruity:hard 5.1245 12.1454 0.422 0.6748
## fruity:pluribus 3.7290 14.8375 0.251 0.8025
## caramel:peanutyalmondy -21.6567 14.1757 -1.528 0.1325
## caramel:nougat 7.1037 14.8120 0.480 0.6335
## caramel:crispedricewafer -2.0528 14.1557 -0.145 0.8853
## caramel:hard 22.6575 19.4727 1.164 0.2498
## caramel:bar -4.2842 13.6508 -0.314 0.7549
## peanutyalmondy:nougat 18.6273 15.6040 1.194 0.2379
## peanutyalmondy:bar -28.6385 13.5621 -2.112 0.0394 *
## peanutyalmondy:pluribus -12.9916 13.2823 -0.978 0.3325
## crispedricewafer:bar 17.7104 13.3324 1.328 0.1897
## hard:pluribus -0.1021 7.5475 -0.014 0.9893
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 9.761 on 53 degrees of freedom
## Multiple R-squared: 0.7169, Adjusted R-squared: 0.562
## F-statistic: 4.628 on 29 and 53 DF, p-value: 6.99e-07
resettest(mod_lm_ia2)
##
## RESET test
##
## data: mod_lm_ia2
## RESET = 1.0939, df1 = 2, df2 = 51, p-value = 0.3426
raintest(mod_lm_ia2)
##
## Rainbow test
##
## data: mod_lm_ia2
## Rain = 0.52355, df1 = 42, df2 = 11, p-value = 0.9344
bptest(mod_lm_ia2)
##
## studentized Breusch-Pagan test
##
## data: mod_lm_ia2
## BP = 27.95, df = 29, p-value = 0.5206
durbinWatsonTest(mod_lm_ia2)
## lag Autocorrelation D-W Statistic p-value
## 1 0.04746414 1.890732 0.466
## Alternative hypothesis: rho != 0
vif(mod_lm_ia2)
## there are higher-order terms (interactions) in this model
## consider setting type = 'predictor'; see ?vif
## chocolate fruity caramel
## 71.191685 53.498399 16.580145
## peanutyalmondy nougat crispedricewafer
## 36.978610 54.564221 8.116112
## hard bar pluribus
## 23.942560 19.872215 43.246644
## sugarpercent pricepercent chocolate:fruity
## 1.488276 2.132169 4.607145
## chocolate:caramel chocolate:peanutyalmondy chocolate:nougat
## 17.720456 19.371192 51.333575
## chocolate:pluribus fruity:caramel fruity:hard
## 33.760596 2.549970 16.975454
## fruity:pluribus caramel:peanutyalmondy caramel:nougat
## 41.259431 6.098930 8.767363
## caramel:crispedricewafer caramel:hard caramel:bar
## 6.081766 3.932069 14.138982
## peanutyalmondy:nougat peanutyalmondy:bar peanutyalmondy:pluribus
## 7.389826 12.374338 7.049923
## crispedricewafer:bar hard:pluribus
## 10.385179 4.322207
Could be that chocolate and peanutyalmondy as well as as peanutyalmondy and bar is very good. But better do a double lasso because of colinearity problem which still exists as can be seen in the VIFs.
rs <- rlasso(paste0("winpercent~(", characteristica, ")^2+."), data_num)
selected <- which(coef(rs)[-c(1:1)] != 0) # = Select relevant variables = #
formula <- paste(c("winpercent ~", names(selected)), collapse = "+")
mod_lm_ia_lasso <- lm(formula, data = data_num)
summary(mod_lm_ia_lasso)
##
## Call:
## lm(formula = formula, data = data_num)
##
## Residuals:
## Min 1Q Median 3Q Max
## -22.5600 -7.7555 -0.0628 8.3021 24.7670
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 42.271 1.622 26.059 < 2e-16 ***
## chocolate 15.011 2.734 5.491 4.58e-07 ***
## chocolate:peanutyalmondy 11.222 3.864 2.904 0.00476 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 11 on 80 degrees of freedom
## Multiple R-squared: 0.4571, Adjusted R-squared: 0.4436
## F-statistic: 33.68 on 2 and 80 DF, p-value: 2.441e-11
resettest(mod_lm_ia_lasso)
##
## RESET test
##
## data: mod_lm_ia_lasso
## RESET = 0, df1 = 2, df2 = 78, p-value = 1
raintest(mod_lm_ia_lasso)
##
## Rainbow test
##
## data: mod_lm_ia_lasso
## Rain = 1.1539, df1 = 42, df2 = 38, p-value = 0.3288
bptest(mod_lm_ia_lasso)
##
## studentized Breusch-Pagan test
##
## data: mod_lm_ia_lasso
## BP = 1.3067, df = 2, p-value = 0.5203
durbinWatsonTest(mod_lm_ia_lasso)
## lag Autocorrelation D-W Statistic p-value
## 1 0.07416245 1.835764 0.398
## Alternative hypothesis: rho != 0
vif(mod_lm_ia_lasso)
## there are higher-order terms (interactions) in this model
## consider setting type = 'predictor'; see ?vif
## chocolate chocolate:peanutyalmondy
## 1.266024 1.266024
plot(mod_lm_ia_lasso)
Chocolate and peanutyalmondy is definitely a good idea! Besides, Lasso gives a well specified model.
Note that cluster sizes of one are okay, c.f. https://stats.stackexchange.com/questions/388937/minimum-sample-size-per-cluster-in-a-random-effect-model We do not consider random slope models as the sample size is too few for this.
glm_data <- subset(data, select = -c(competitorname))
# get covariates
data_tmp <- subset(glm_data, select = -c(producer, winpercent))
all_covariates <- colnames(data_tmp)
# build formula
formula <- paste(c("winpercent~(1|producer)", all_covariates), collapse = "+")
# do glm
mod_glm <- lmer(formula, data = glm_data, control = lmerControl(optimizer = "bobyqa"))
summary(mod_glm)
## Linear mixed model fit by REML ['lmerMod']
## Formula: winpercent ~ (1 | producer) + chocolate + fruity + caramel +
## peanutyalmondy + nougat + crispedricewafer + hard + bar +
## pluribus + sugarpercent + pricepercent
## Data: glm_data
## Control: lmerControl(optimizer = "bobyqa")
##
## REML criterion at convergence: 554.7
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -2.21056 -0.49201 0.00977 0.53617 1.86012
##
## Random effects:
## Groups Name Variance Std.Dev.
## producer (Intercept) 31.21 5.587
## Residual 85.06 9.223
## Number of obs: 83, groups: producer, 18
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 35.08133 4.82011 7.278
## chocolate 19.92717 3.80867 5.232
## fruity 8.88721 3.75832 2.365
## caramel 1.99811 3.33333 0.599
## peanutyalmondy 7.55000 3.29266 2.293
## nougat -0.81961 5.01376 -0.163
## crispedricewafer 7.09682 4.60949 1.540
## hard -3.90837 3.33892 -1.171
## bar -0.02138 4.60408 -0.005
## pluribus -0.86395 2.98638 -0.289
## sugarpercent 8.46224 4.22019 2.005
## pricepercent -7.54571 5.14366 -1.467
##
## Correlation of Fixed Effects:
## (Intr) choclt fruity caraml pntylm nougat crspdr hard bar
## chocolate -0.462
## fruity -0.671 0.579
## caramel -0.229 0.136 0.272
## peantylmndy -0.196 -0.032 0.227 0.121
## nougat -0.032 0.039 0.040 -0.189 -0.089
## crispdrcwfr 0.019 -0.092 0.017 -0.143 0.181 0.384
## hard -0.154 -0.023 -0.171 0.022 0.085 0.002 0.037
## bar -0.240 -0.135 0.126 0.090 0.130 -0.483 -0.312 0.109
## pluribus -0.500 0.074 0.173 0.201 0.186 -0.033 0.004 0.194 0.522
## sugarpercnt -0.197 0.046 -0.007 -0.122 -0.027 -0.071 -0.019 -0.185 0.075
## pricepercnt -0.126 -0.153 -0.025 -0.116 -0.166 0.153 -0.060 0.063 -0.345
## plurbs sgrprc
## chocolate
## fruity
## caramel
## peantylmndy
## nougat
## crispdrcwfr
## hard
## bar
## pluribus
## sugarpercnt -0.084
## pricepercnt -0.162 -0.347
r.squaredGLMM(mod_glm)
## R2m R2c
## [1,] 0.423914 0.5785676
Check validity of random intercepts
# ICC
icc_output <- performance::icc(mod_glm)
as.data.frame(icc_output)
| ICC_adjusted | ICC_unadjusted | optional |
|---|---|---|
| 0.268 | 0.155 | FALSE |
# cAIC
cAIC(mod_lm)
##
## Conditional log-likelihood: -307.86
## Degrees of freedom: 13.00
## Conditional Akaike information criterion: 641.71
cAIC(mod_glm)
##
## Conditional log-likelihood: -292.79
## Degrees of freedom: 20.31
## Conditional Akaike information criterion: 626.19
# inspect random intercepts
ranef(mod_glm)
## $producer
## (Intercept)
## American Licorice Company -2.6120017
## Ferrara -1.5794195
## Haribo 2.9841001
## Hershey 4.1525747
## Impact -0.1986142
## Just Born -0.4482936
## Mars 8.1069462
## Mondelez 3.5070438
## Perfetti 1.2200314
## Several -2.0302559
## Spangler -1.7543782
## Storck 2.5011972
## SweetWorks -5.5055836
## Tootsie -6.8833535
## Topps -0.9701245
## Washburn -1.6826901
## Welchs 0.2944472
## Zeta 0.8983742
##
## with conditional variances for "producer"
ICC above 0.10, random intercepts differ, cAIC for random intercept model is smaller, hence random intercept model valid and has better pseudo R-squared. Results robust with random intercepts but effect sizes got smaller. Also nougat, bar and pluribus went negative, so do not produce it like this!
mod_wo_sugar <- lm(winpercent ~ ., robust_num)
summary(mod_wo_sugar)
##
## Call:
## lm(formula = winpercent ~ ., data = robust_num)
##
## Residuals:
## Min 1Q Median 3Q Max
## -21.3393 -5.7765 0.2818 6.7984 21.3206
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 34.2670 5.0476 6.789 2.7e-09 ***
## chocolate 21.2146 4.2195 5.028 3.5e-06 ***
## fruity 11.3362 4.1649 2.722 0.00814 **
## caramel 4.1148 3.7248 1.105 0.27296
## peanutyalmondy 10.9349 3.7314 2.930 0.00453 **
## nougat 1.7557 5.8161 0.302 0.76362
## crispedricewafer 9.1631 5.3871 1.701 0.09327 .
## hard -4.6263 3.5013 -1.321 0.19058
## bar 0.9368 5.2639 0.178 0.85925
## pluribus 0.8337 3.2550 0.256 0.79859
## pricepercent -3.5146 5.3512 -0.657 0.51341
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 10.95 on 72 degrees of freedom
## Multiple R-squared: 0.5163, Adjusted R-squared: 0.4491
## F-statistic: 7.685 on 10 and 72 DF, p-value: 3.178e-08
resettest(mod_wo_sugar)
##
## RESET test
##
## data: mod_wo_sugar
## RESET = 2.894, df1 = 2, df2 = 70, p-value = 0.06201
raintest(mod_wo_sugar)
##
## Rainbow test
##
## data: mod_wo_sugar
## Rain = 1.4175, df1 = 42, df2 = 30, p-value = 0.1602
bptest(mod_wo_sugar)
##
## studentized Breusch-Pagan test
##
## data: mod_wo_sugar
## BP = 10.235, df = 10, p-value = 0.4201
durbinWatsonTest(mod_wo_sugar)
## lag Autocorrelation D-W Statistic p-value
## 1 0.08178324 1.816866 0.312
## Alternative hypothesis: rho != 0
vif(mod_wo_sugar)
## chocolate fruity caramel peanutyalmondy
## 3.046639 2.982304 1.347473 1.352322
## nougat crispedricewafer hard bar
## 1.809357 1.552256 1.257185 3.627245
## pluribus pricepercent
## 1.827957 1.631432
plot(mod_wo_sugar)
The previous results for the linear model seem to be still robust. Only hard is not significant any more. But the model specification tests are better! Next look at the quadratic terms.
mod_lm_sq_wo_sugar <- lm(winpercent ~ . + I(pricepercent^2), robust_num)
summary(mod_lm_sq_wo_sugar)
##
## Call:
## lm(formula = winpercent ~ . + I(pricepercent^2), data = robust_num)
##
## Residuals:
## Min 1Q Median 3Q Max
## -22.235 -6.556 1.140 7.343 21.384
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 30.5545 5.5187 5.537 4.89e-07 ***
## chocolate 21.1084 4.1766 5.054 3.24e-06 ***
## fruity 10.7972 4.1361 2.611 0.0110 *
## caramel 3.6349 3.6989 0.983 0.3291
## peanutyalmondy 9.8040 3.7615 2.606 0.0111 *
## nougat -0.4861 5.9279 -0.082 0.9349
## crispedricewafer 8.8070 5.3363 1.650 0.1033
## hard -3.8891 3.4964 -1.112 0.2698
## bar 2.2732 5.2777 0.431 0.6680
## pluribus 0.8172 3.2215 0.254 0.8005
## pricepercent 22.5695 17.3084 1.304 0.1965
## I(pricepercent^2) -27.1161 17.1302 -1.583 0.1179
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 10.83 on 71 degrees of freedom
## Multiple R-squared: 0.5328, Adjusted R-squared: 0.4604
## F-statistic: 7.361 on 11 and 71 DF, p-value: 3.071e-08
resettest(mod_lm_sq_wo_sugar)
##
## RESET test
##
## data: mod_lm_sq_wo_sugar
## RESET = 2.2925, df1 = 2, df2 = 69, p-value = 0.1087
raintest(mod_lm_sq_wo_sugar)
##
## Rainbow test
##
## data: mod_lm_sq_wo_sugar
## Rain = 1.8759, df1 = 42, df2 = 29, p-value = 0.03913
bptest(mod_lm_sq_wo_sugar)
##
## studentized Breusch-Pagan test
##
## data: mod_lm_sq_wo_sugar
## BP = 9.7106, df = 11, p-value = 0.5566
durbinWatsonTest(mod_lm_sq_wo_sugar)
## lag Autocorrelation D-W Statistic p-value
## 1 0.1325029 1.718943 0.144
## Alternative hypothesis: rho != 0
vif(mod_lm_sq_wo_sugar)
## chocolate fruity caramel peanutyalmondy
## 3.047426 3.002648 1.356588 1.402931
## nougat crispedricewafer hard bar
## 1.918889 1.555018 1.279895 3.722510
## pluribus pricepercent I(pricepercent^2)
## 1.827976 17.424980 17.067684
plot(mod_lm_sq_wo_sugar)
Concerning adjusted R^2 and model specification test, this is the best model so far. Let us take a look at the random intercept model.
glm_data <- subset(robust, select = -c(competitorname))
# get covariates
data_tmp <- subset(glm_data, select = -c(producer, winpercent))
all_covariates <- colnames(data_tmp)
# build formula
formula <- paste(c("winpercent~(1|producer)", all_covariates), collapse = "+")
# do glm
mod_glm_wo_sugar <- lmer(formula, data = glm_data, control = lmerControl(optimizer = "bobyqa"))
summary(mod_glm_wo_sugar)
## Linear mixed model fit by REML ['lmerMod']
## Formula: winpercent ~ (1 | producer) + chocolate + fruity + caramel +
## peanutyalmondy + nougat + crispedricewafer + hard + bar +
## pluribus + pricepercent
## Data: glm_data
## Control: lmerControl(optimizer = "bobyqa")
##
## REML criterion at convergence: 563.4
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -2.33289 -0.48834 0.04583 0.58399 1.97023
##
## Random effects:
## Groups Name Variance Std.Dev.
## producer (Intercept) 31.64 5.625
## Residual 88.88 9.427
## Number of obs: 83, groups: producer, 18
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 36.98003 4.82074 7.671
## chocolate 19.57678 3.88561 5.038
## fruity 8.95990 3.83723 2.335
## caramel 2.82292 3.38032 0.835
## peanutyalmondy 7.74821 3.36337 2.304
## nougat -0.08993 5.11131 -0.018
## crispedricewafer 7.28770 4.71046 1.547
## hard -2.69400 3.34803 -0.805
## bar -0.70627 4.69057 -0.151
## pluribus -0.35710 3.03803 -0.118
## pricepercent -3.94503 4.92509 -0.801
##
## Correlation of Fixed Effects:
## (Intr) choclt fruity caraml pntylm nougat crspdr hard bar
## chocolate -0.464
## fruity -0.687 0.580
## caramel -0.261 0.143 0.273
## peantylmndy -0.205 -0.032 0.227 0.119
## nougat -0.047 0.042 0.039 -0.200 -0.092
## crispdrcwfr 0.015 -0.091 0.017 -0.147 0.181 0.384
## hard -0.198 -0.014 -0.174 -0.001 0.081 -0.011 0.034
## bar -0.231 -0.139 0.127 0.100 0.131 -0.481 -0.312 0.126
## pluribus -0.529 0.079 0.173 0.193 0.184 -0.040 0.002 0.183 0.531
## pricepercnt -0.212 -0.146 -0.029 -0.170 -0.186 0.137 -0.071 -0.002 -0.341
## plurbs
## chocolate
## fruity
## caramel
## peantylmndy
## nougat
## crispdrcwfr
## hard
## bar
## pluribus
## pricepercnt -0.204
r.squaredGLMM(mod_glm_wo_sugar)
## R2m R2c
## [1,] 0.3937094 0.5528884
# ICC
icc_output <- performance::icc(mod_glm_wo_sugar)
as.data.frame(icc_output)
| ICC_adjusted | ICC_unadjusted | optional |
|---|---|---|
| 0.263 | 0.159 | FALSE |
# cAIC
cAIC(mod_glm_wo_sugar)
##
## Conditional log-likelihood: -295.13
## Degrees of freedom: 19.23
## Conditional Akaike information criterion: 628.72
cAIC(mod_glm_wo_sugar)
##
## Conditional log-likelihood: -295.13
## Degrees of freedom: 19.23
## Conditional Akaike information criterion: 628.72
# inspect random intercepts
ranef(mod_glm_wo_sugar)
## $producer
## (Intercept)
## American Licorice Company -2.049570340
## Ferrara -1.881137952
## Haribo 2.749819075
## Hershey 3.820180384
## Impact -0.999315229
## Just Born 0.542074150
## Mars 9.174189093
## Mondelez 2.171138661
## Perfetti 2.204811341
## Several -2.187326495
## Spangler -0.968874497
## Storck 1.521057479
## SweetWorks -5.564006997
## Tootsie -6.825687836
## Topps -1.090281154
## Washburn -1.046922866
## Welchs -0.004999566
## Zeta 0.434852750
##
## with conditional variances for "producer"
It is also robust. Last, check interaction terms. Do normal lm with interaction without sugarpercent:
mod_lm_ia_wo_sugar <- lm(paste0("winpercent~(", characteristica, ")^2+."), robust_num)
summary(mod_lm_ia_wo_sugar)
##
## Call:
## lm(formula = paste0("winpercent~(", characteristica, ")^2+."),
## data = robust_num)
##
## Residuals:
## Min 1Q Median 3Q Max
## -22.287 -5.030 0.000 5.086 19.716
##
## Coefficients: (18 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 34.55554 15.45935 2.235 0.0296 *
## chocolate 10.89272 18.80816 0.579 0.5649
## fruity 10.35803 16.22061 0.639 0.5258
## caramel -0.85160 11.96425 -0.071 0.9435
## peanutyalmondy 1.45926 17.97258 0.081 0.9356
## nougat 17.04178 29.18591 0.584 0.5617
## crispedricewafer 2.11933 11.18917 0.189 0.8505
## hard -8.49354 14.04498 -0.605 0.5479
## bar 11.38077 11.36742 1.001 0.3212
## pluribus 3.30266 14.48198 0.228 0.8205
## pricepercent -4.70208 5.36497 -0.876 0.3847
## chocolate:fruity -0.27138 21.81002 -0.012 0.9901
## chocolate:caramel 13.07220 14.32210 0.913 0.3654
## chocolate:peanutyalmondy 35.15443 13.67527 2.571 0.0129 *
## chocolate:nougat -18.20246 30.55374 -0.596 0.5538
## chocolate:crispedricewafer NA NA NA NA
## chocolate:hard NA NA NA NA
## chocolate:bar NA NA NA NA
## chocolate:pluribus 2.61947 18.23292 0.144 0.8863
## fruity:caramel -8.07135 16.22061 -0.498 0.6208
## fruity:peanutyalmondy NA NA NA NA
## fruity:nougat NA NA NA NA
## fruity:crispedricewafer NA NA NA NA
## fruity:hard 3.41421 12.54127 0.272 0.7865
## fruity:bar NA NA NA NA
## fruity:pluribus 0.06859 15.25538 0.004 0.9964
## caramel:peanutyalmondy -21.43316 14.66755 -1.461 0.1497
## caramel:nougat 2.25620 15.15494 0.149 0.8822
## caramel:crispedricewafer -5.02959 14.57986 -0.345 0.7315
## caramel:hard 17.88360 20.02275 0.893 0.3757
## caramel:bar -0.67403 14.02180 -0.048 0.9618
## caramel:pluribus NA NA NA NA
## peanutyalmondy:nougat 16.51993 16.11517 1.025 0.3099
## peanutyalmondy:crispedricewafer NA NA NA NA
## peanutyalmondy:hard NA NA NA NA
## peanutyalmondy:bar -31.09163 13.98532 -2.223 0.0304 *
## peanutyalmondy:pluribus -13.52027 13.74120 -0.984 0.3295
## nougat:crispedricewafer NA NA NA NA
## nougat:hard NA NA NA NA
## nougat:bar NA NA NA NA
## nougat:pluribus NA NA NA NA
## crispedricewafer:hard NA NA NA NA
## crispedricewafer:bar 12.97754 13.61367 0.953 0.3447
## crispedricewafer:pluribus NA NA NA NA
## hard:bar NA NA NA NA
## hard:pluribus 0.62229 7.80205 0.080 0.9367
## bar:pluribus NA NA NA NA
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 10.1 on 54 degrees of freedom
## Multiple R-squared: 0.6912, Adjusted R-squared: 0.5311
## F-statistic: 4.317 on 28 and 54 DF, p-value: 2.063e-06
Same as before, but better do a double lasso because of colinearity problem.
rs <- rlasso(winpercent ~ .^2, robust_num)
selected <- which(coef(rs)[-c(1:1)] != 0) # = Select relevant variables = #
formula <- paste(c("winpercent ~", names(selected)), collapse = "+")
mod_lm_ia_lasso_wo_sugar <- lm(formula, data = data_num)
summary(mod_lm_ia_lasso_wo_sugar)
##
## Call:
## lm(formula = formula, data = data_num)
##
## Residuals:
## Min 1Q Median 3Q Max
## -26.6242 -7.7223 -0.3664 8.4296 24.7670
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 42.271 1.686 25.068 <2e-16 ***
## chocolate 12.270 5.339 2.298 0.0242 *
## chocolate:pricepercent 10.217 7.531 1.357 0.1787
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 11.44 on 80 degrees of freedom
## Multiple R-squared: 0.4134, Adjusted R-squared: 0.3987
## F-statistic: 28.19 on 2 and 80 DF, p-value: 5.419e-10
vif(mod_lm_ia_lasso_wo_sugar)
## there are higher-order terms (interactions) in this model
## consider setting type = 'predictor'; see ?vif
## chocolate chocolate:pricepercent
## 4.468427 4.468427
plot(mod_lm_ia_lasso_wo_sugar)
Chocolate and peanutyalmondy is not important any more. But note that R-square is very low.
We want to find the candy with the highest winpercent. Given only the data, this is:
print(data[data$winpercent == max(data$winpercent), ])
## competitorname producer chocolate fruity caramel peanutyalmondy
## <char> <char> <int> <int> <int> <int>
## 1: ReeseÕs Peanut Butter cup Hershey 1 0 0 1
## nougat crispedricewafer hard bar pluribus sugarpercent pricepercent
## <int> <int> <int> <int> <int> <num> <num>
## 1: 0 0 0 0 0 0.7108434 0.6385542
## winpercent
## <num>
## 1: 84.18029
Now we can check if there is a better solution by using the lm-model and inventing observations. We use a grid for that.
inventions <- read.csv(text = paste(colnames(data_num), collapse = ","))
inventions$winpercent <- NULL
for (choc in seq(0, 1)) {
for (fruit in seq(0, 1)) {
for (caram in seq(0, 1)) {
for (peanut in seq(0, 1)) {
for (noug in seq(0, 1)) {
for (crisped in seq(0, 1)) {
for (har in seq(0, 1)) {
for (ba in seq(0, 1)) {
for (plur in seq(0, 1)) {
for (sugar in seq(0, 1, 0.2)) {
for (price in seq(0, 1, 0.2)) {
inventions[nrow(inventions) + 1, ] <- c(choc, fruit, caram, peanut, noug, crisped, har, ba, plur, sugar, price)
}
}
}
}
}
}
}
}
}
}
}
Now we do the prediction but we use a model without sugar in order to prevent unrealistic products as this variable might be wrong. Additionally, the model specification for models without sugar were better.We do not use the interaction model as due to colinearity the prediction might be wrong. In the end we use mod_lm_sq_wo_sugar as this has the best specification tests and adjusted R^2.
inventions_w_win <- inventions
# filter with row value <= 5, i.e. max(data$sum)
inventions_w_win$sum <- (
inventions_w_win$chocolate +
inventions_w_win$fruity +
inventions_w_win$caramel +
inventions_w_win$peanutyalmondy +
inventions_w_win$nougat +
inventions_w_win$crispedricewafer +
inventions_w_win$hard +
inventions_w_win$bar +
inventions_w_win$pluribus
)
inventions_w_win <- inventions_w_win[inventions_w_win$sum <= 5, ]
# predict
win <- predict(mod_lm_sq_wo_sugar, inventions_w_win)
inventions_w_win <- cbind(inventions_w_win, win)
inventions_w_win <- inventions_w_win[order(-win), ]
head(inventions_w_win, 20)
| chocolate | fruity | caramel | peanutyalmondy | nougat | crispedricewafer | hard | bar | pluribus | sugarpercent | pricepercent | sum | win |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | 1 | 1 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0.4 | 5 | 89.4 |
| 1 | 1 | 1 | 1 | 0 | 1 | 0 | 0 | 0 | 0.2 | 0.4 | 5 | 89.4 |
| 1 | 1 | 1 | 1 | 0 | 1 | 0 | 0 | 0 | 0.4 | 0.4 | 5 | 89.4 |
| 1 | 1 | 1 | 1 | 0 | 1 | 0 | 0 | 0 | 0.6 | 0.4 | 5 | 89.4 |
| 1 | 1 | 1 | 1 | 0 | 1 | 0 | 0 | 0 | 0.8 | 0.4 | 5 | 89.4 |
| 1 | 1 | 1 | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 0.4 | 5 | 89.4 |
| 1 | 1 | 1 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0.6 | 5 | 88.5 |
| 1 | 1 | 1 | 1 | 0 | 1 | 0 | 0 | 0 | 0.2 | 0.6 | 5 | 88.5 |
| 1 | 1 | 1 | 1 | 0 | 1 | 0 | 0 | 0 | 0.4 | 0.6 | 5 | 88.5 |
| 1 | 1 | 1 | 1 | 0 | 1 | 0 | 0 | 0 | 0.6 | 0.6 | 5 | 88.5 |
| 1 | 1 | 1 | 1 | 0 | 1 | 0 | 0 | 0 | 0.8 | 0.6 | 5 | 88.5 |
| 1 | 1 | 1 | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 0.6 | 5 | 88.5 |
| 1 | 1 | 1 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0.2 | 5 | 88.1 |
| 1 | 1 | 1 | 1 | 0 | 1 | 0 | 0 | 0 | 0.2 | 0.2 | 5 | 88.1 |
| 1 | 1 | 1 | 1 | 0 | 1 | 0 | 0 | 0 | 0.4 | 0.2 | 5 | 88.1 |
| 1 | 1 | 1 | 1 | 0 | 1 | 0 | 0 | 0 | 0.6 | 0.2 | 5 | 88.1 |
| 1 | 1 | 1 | 1 | 0 | 1 | 0 | 0 | 0 | 0.8 | 0.2 | 5 | 88.1 |
| 1 | 1 | 1 | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 0.2 | 5 | 88.1 |
| 1 | 1 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 0.4 | 5 | 88 |
| 1 | 1 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 0.2 | 0.4 | 5 | 88 |
write.xlsx(head(inventions_w_win, 20), here::here("output", "2021", "predictions.xlsx"))
Last, save the regression coefficients for use in the PPT
hux <- huxtablereg(
list(
mod_lm,
mod_lm_sq,
mod_lm_ia,
mod_lm_ia_lasso,
mod_glm,
mod_wo_sugar,
mod_lm_sq_wo_sugar
),
stars = numeric(0),
single.row = TRUE
)
quick_pptx(hux, file = here::here("output", "2021", "regs.pptx"))
We do some special isolated regression to have a better knowledge of certain combinations, e.g. fruit gums.
mod_fruit_gum <- lm(winpercent ~ pluribus * fruity, data_num)
summary(mod_fruit_gum)
##
## Call:
## lm(formula = winpercent ~ pluribus * fruity, data = data_num)
##
## Residuals:
## Min 1Q Median 3Q Max
## -28.634 -9.972 -1.069 9.538 24.621
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 60.865 2.487 24.471 < 2e-16 ***
## pluribus -12.050 3.933 -3.064 0.00299 **
## fruity -19.614 4.484 -4.374 3.68e-05 ***
## pluribus:fruity 16.244 5.984 2.715 0.00815 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 12.92 on 79 degrees of freedom
## Multiple R-squared: 0.2603, Adjusted R-squared: 0.2322
## F-statistic: 9.265 on 3 and 79 DF, p-value: 2.545e-05
Both coefficients are negative, but together it seems okay. Consider that in the interaction model the fruity:pluribus coefficient was close to zero.